Please check below video before attempting this assignment
from IPython.display import YouTubeVideo
YouTubeVideo('ZhLXULFjIjQ', width="1000",height="500")
TF-IDFW2V
Tfidf w2v (w1,w2..) = (tfidf(w1) * w2v(w1) + tfidf(w2) * w2v(w2) + …) / (tfidf(w1) + tfidf(w2) + …)
(Optional) Please check course video on AVgw2V and TF-IDFW2V for more details.
Glove vectors
In this assignment you will be working with glove vectors , please check [this](https://en.wikipedia.org/wiki/GloVe_(machine_learning)) and [this](https://en.wikipedia.org/wiki/GloVe_(machine_learning)) for more details.
Download glove vectors from this link
#please use below code to load glove vectors
import pickle
with open('glove_vectors', 'rb') as f:
model = pickle.load(f)
glove_words = set(model.keys())
or else , you can use below code
'''
# Reading glove vectors in python: https://stackoverflow.com/a/38230349/4084039
def loadGloveModel(gloveFile):
print ("Loading Glove Model")
f = open(gloveFile,'r', encoding="utf8")
model = {}
for line in tqdm(f):
splitLine = line.split()
word = splitLine[0]
embedding = np.array([float(val) for val in splitLine[1:]])
model[word] = embedding
print ("Done.",len(model)," words loaded!")
return model
model = loadGloveModel('glove.42B.300d.txt')
# ============================
Output:
Loading Glove Model
1917495it [06:32, 4879.69it/s]
Done. 1917495 words loaded!
# ============================
words = []
for i in preproced_texts:
words.extend(i.split(' '))
for i in preproced_titles:
words.extend(i.split(' '))
print("all the words in the coupus", len(words))
words = set(words)
print("the unique words in the coupus", len(words))
inter_words = set(model.keys()).intersection(words)
print("The number of words that are present in both glove vectors and our coupus", \
len(inter_words),"(",np.round(len(inter_words)/len(words)*100,3),"%)")
words_courpus = {}
words_glove = set(model.keys())
for i in words:
if i in words_glove:
words_courpus[i] = model[i]
print("word 2 vec length", len(words_courpus))
# stronging variables into pickle files python: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/
import pickle
with open('glove_vectors', 'wb') as f:
pickle.dump(words_courpus, f)
'''
'\n# Reading glove vectors in python: https://stackoverflow.com/a/38230349/4084039\ndef loadGloveModel(gloveFile):\n print ("Loading Glove Model")\n f = open(gloveFile,\'r\', encoding="utf8")\n model = {}\n for line in tqdm(f):\n splitLine = line.split()\n word = splitLine[0]\n embedding = np.array([float(val) for val in splitLine[1:]])\n model[word] = embedding\n print ("Done.",len(model)," words loaded!")\n return model\nmodel = loadGloveModel(\'glove.42B.300d.txt\')\n\n# ============================\nOutput:\n \nLoading Glove Model\n1917495it [06:32, 4879.69it/s]\nDone. 1917495 words loaded!\n\n# ============================\n\nwords = []\nfor i in preproced_texts:\n words.extend(i.split(\' \'))\n\nfor i in preproced_titles:\n words.extend(i.split(\' \'))\nprint("all the words in the coupus", len(words))\nwords = set(words)\nprint("the unique words in the coupus", len(words))\n\ninter_words = set(model.keys()).intersection(words)\nprint("The number of words that are present in both glove vectors and our coupus", len(inter_words),"(",np.round(len(inter_words)/len(words)*100,3),"%)")\n\nwords_courpus = {}\nwords_glove = set(model.keys())\nfor i in words:\n if i in words_glove:\n words_courpus[i] = model[i]\nprint("word 2 vec length", len(words_courpus))\n\n\n# stronging variables into pickle files python: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/\n\nimport pickle\nwith open(\'glove_vectors\', \'wb\') as f:\n pickle.dump(words_courpus, f)\n\n\n'
with X-axis as min_sample_split, Y-axis as max_depth, and Z-axis as AUC Score , we have given the notebook which explains how to plot this 3d plot, you can find it in the same drive 3d_scatter_plot.ipynbor
seaborn heat maps with rows as min_sample_split, columns as max_depth, and values inside the cell representing AUC Score 

For this task consider set-1 features.
<img src='http://i.imgur.com/YVpIGGE.jpg' width=400px>
</li>
</ol>Hint for calculating Sentiment scores
# import nltk
# nltk.download('vader_lexicon')
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#import nltk
#nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
sample_sentence_1='I am happy.'
ss_1 = sid.polarity_scores(sample_sentence_1)
print('sentiment score for sentence 1',ss_1)
sample_sentence_2='I am sad.'
ss_2 = sid.polarity_scores(sample_sentence_2)
print('sentiment score for sentence 2',ss_2)
sample_sentence_3='I am going to New Delhi tommorow.'
ss_3 = sid.polarity_scores(sample_sentence_3)
print('sentiment score for sentence 3',ss_3)
sentiment score for sentence 1 {'neg': 0.0, 'neu': 0.213, 'pos': 0.787, 'compound': 0.5719}
sentiment score for sentence 2 {'neg': 0.756, 'neu': 0.244, 'pos': 0.0, 'compound': -0.4767}
sentiment score for sentence 3 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
#make sure you are loading atleast 50k datapoints
#you can work with features of preprocessed_data.csv for the assignment.
import pandas
data = pandas.read_csv('preprocessed_data.csv',nrows=50000)
get_features=[]
#required modules
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
# write your code in following steps for task 1
# 1. calculate sentiment scores for the essay feature
# 2. Split your data.
# 3. perform tfidf vectorization of text data.
# 4. perform tfidf w2v vectorization of text data.
# 5. perform encoding of categorical features.
# 6. perform encoding of numerical features
# 7. For task 1 set 1 stack up all the features
# 8. For task 1 set 2 stack up all the features (for stacking dense features you can use np.stack)
# 9. Perform hyperparameter tuning and plot either heatmap or 3d plot.
# 10. Find the best parameters and fit the model. Plot ROC-AUC curve(using predict proba method)
# 11. Plot confusion matrix based on best threshold value
# 12. Find all the false positive data points and plot wordcloud of essay text and pdf of teacher_number_of_previously_posted_projects.
# 13. Write your observations about the wordcloud and pdf.
# please write all the code with proper documentation, and proper titles for each subsection
# go through documentations and blogs before you start coding
# first figure out what to do, and then think about how to do.
# reading and understanding error messages will be very much helpfull in debugging your code
# when you plot any graph make sure you use
# a. Title, that describes your plot, this will be very helpful to the reader
# b. Legends if needed
# c. X-axis label
# d. Y-axis label
data.columns
Index(['school_state', 'teacher_prefix', 'project_grade_category',
'teacher_number_of_previously_posted_projects', 'project_is_approved',
'clean_categories', 'clean_subcategories', 'essay', 'price'],
dtype='object')
y=data.project_is_approved
x=data.drop(columns='project_is_approved',axis=1)
y.value_counts()
#Imbalanced dataset
1 41993 0 8007 Name: project_is_approved, dtype: int64
# 2. Split your data.
X_train,X_test,Y_train,Y_test=train_test_split( x, y, test_size=0.2, random_state=42)
print(Y_train.value_counts())
1 33617 0 6383 Name: project_is_approved, dtype: int64
essay_sentiment_train=pd.DataFrame(columns=['neg','neu','pos','compound']) for i in range(Y_train.shape[0]): essay_sentiment_train=essay_sentiment_train.append(sid.polarity_scores(X_train.essay[i]),ignore_index=True) print(essay_sentiment_train.to_numpy())
get_features.extend(['neg','neu','pos','compound'])
X_=X_train.essay.map(lambda x:list(sid.polarity_scores(x).values()))
essay_sentiment_train=pd.DataFrame(columns=['neg','neu','pos','compound'])
for i in X_:
d={'neg':i[0],'neu':i[1],'pos':i[2],'compound':i[3]}
essay_sentiment_train=essay_sentiment_train.append(d,ignore_index=True)
X_=X_test.essay.map(lambda x:list(sid.polarity_scores(x).values()))
essay_sentiment_test=pd.DataFrame(columns=['neg','neu','pos','compound'])
for i in X_:
d={'neg':i[0],'neu':i[1],'pos':i[2],'compound':i[3]}
essay_sentiment_test=essay_sentiment_test.append(d,ignore_index=True)
## 3. perform tfidf vectorization of text data.
vectorizer = TfidfVectorizer(min_df=10,ngram_range=(1,4), max_features=5000)
vectorizer.fit(X_train['essay'].values)
get_features.extend(vectorizer.get_feature_names())
X_train_essay_Tfidf = vectorizer.transform(X_train['essay'].values).todense()
X_test_essay_Tfidf=vectorizer.transform(X_test['essay'].values).todense()
preprocessed_essays=X_train.essay
# average Word2Vec
# compute average word2vec for each review.
avg_w2v_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(preprocessed_essays): # for each review/sentence
vector = np.zeros(300) # as word vectors are of zero length
cnt_words =0; # num of words with a valid vector in the sentence/review
for word in sentence.split(): # for each word in a review/sentence
if word in glove_words:
vector += model[word]
cnt_words += 1
if cnt_words != 0:
vector /= cnt_words
avg_w2v_vectors.append(vector)
print(len(avg_w2v_vectors))
print(len(avg_w2v_vectors[0]))
100%|██████████████████████████████████████████████████████████████████████████| 40000/40000 [00:09<00:00, 4160.44it/s]
40000 300
# S = ["abc def pqr", "def def def abc", "pqr pqr def"]
tfidf_model = TfidfVectorizer()
tfidf_model.fit(preprocessed_essays)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words = set(tfidf_model.get_feature_names())
# average Word2Vec
# compute average word2vec for each review.
tfidf_w2v_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(preprocessed_essays): # for each review/sentence
vector = np.zeros(300) # as word vectors are of zero length
tf_idf_weight =0; # num of words with a valid vector in the sentence/review
for word in sentence.split(): # for each word in a review/sentence
if (word in glove_words) and (word in tfidf_words):
vec = model[word] # getting the vector for each word
# here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) # getting the tfidf value for each word
vector += (vec * tf_idf) # calculating tfidf weighted w2v
tf_idf_weight += tf_idf
if tf_idf_weight != 0:
vector /= tf_idf_weight
tfidf_w2v_vectors.append(vector)
tfidf_w2v_vectors=tfidf_w2v_vectors
print(len(tfidf_w2v_vectors))
print(len(tfidf_w2v_vectors[0]))
100%|███████████████████████████████████████████████████████████████████████████| 40000/40000 [01:17<00:00, 514.53it/s]
40000 300
preprocessed_essays=X_test.essay
# average Word2Vec
# compute average word2vec for each review.
avg_w2v_vectors_test = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(preprocessed_essays): # for each review/sentence
vector = np.zeros(300) # as word vectors are of zero length
cnt_words =0; # num of words with a valid vector in the sentence/review
for word in sentence.split(): # for each word in a review/sentence
if word in glove_words:
vector += model[word]
cnt_words += 1
if cnt_words != 0:
vector /= cnt_words
avg_w2v_vectors_test.append(vector)
print(len(avg_w2v_vectors_test))
print(len(avg_w2v_vectors_test[0]))
# S = ["abc def pqr", "def def def abc", "pqr pqr def"]
tfidf_model = TfidfVectorizer()
tfidf_model.fit(preprocessed_essays)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words = set(tfidf_model.get_feature_names())
# average Word2Vec
# compute average word2vec for each review.
tfidf_w2v_vectors_test = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(preprocessed_essays): # for each review/sentence
vector = np.zeros(300) # as word vectors are of zero length
tf_idf_weight =0; # num of words with a valid vector in the sentence/review
for word in sentence.split(): # for each word in a review/sentence
if (word in glove_words) and (word in tfidf_words):
vec = model[word] # getting the vector for each word
# here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) # getting the tfidf value for each word
vector += (vec * tf_idf) # calculating tfidf weighted w2v
tf_idf_weight += tf_idf
if tf_idf_weight != 0:
vector /= tf_idf_weight
tfidf_w2v_vectors_test.append(vector)
tfidf_w2v_vectors=tfidf_w2v_vectors
print(len(tfidf_w2v_vectors_test))
print(len(tfidf_w2v_vectors_test[0]))
100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 4114.36it/s]
10000 300
100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:19<00:00, 506.55it/s]
10000 300
#One hot encoding for school_state
vectorizer = CountVectorizer(binary=True)#one hot encoding
vectorizer.fit(X_train['school_state'].values) # fit has to happen only on train data
get_features.extend(vectorizer.get_feature_names())
# we use the fitted CountVectorizer to convert the text to vector
X_train_state_ohe = vectorizer.transform(X_train['school_state'].values).todense()
X_test_state_ohe = vectorizer.transform(X_test['school_state'].values).todense()
#one hot encoding for teacher_prefix
vectorizer = CountVectorizer(binary=True)#one hot encoding
vectorizer.fit(X_train['teacher_prefix'].values) # fit has to happen only on train data
get_features.extend(vectorizer.get_feature_names())
# we use the fitted CountVectorizer to convert the text to vector
X_train_teacher_ohe = vectorizer.transform(X_train['teacher_prefix'].values).todense()
X_test_teacher_ohe = vectorizer.transform(X_test['teacher_prefix'].values).todense()
#one hot encoding for project_grade_category
vectorizer = CountVectorizer(binary=True)#one hot encoding
vectorizer.fit(X_train['project_grade_category'].values) # fit has to happen only on train data
get_features.extend(vectorizer.get_feature_names())
# we use the fitted CountVectorizer to convert the text to vector
X_train_project_ohe = vectorizer.transform(X_train['project_grade_category'].values).todense()
X_test_project_ohe = vectorizer.transform(X_test['project_grade_category'].values).todense()
#one hot encoding for clean_categories
vectorizer = CountVectorizer(binary=True)#one hot encoding
vectorizer.fit(X_train['clean_categories'].values) # fit has to happen only on train data
get_features.extend(vectorizer.get_feature_names())
# we use the fitted CountVectorizer to convert the text to vector
X_train_clean_categories_ohe = vectorizer.transform(X_train['clean_categories'].values).todense()
X_test_clean_categories_ohe = vectorizer.transform(X_test['clean_categories'].values).todense()
#one hot encoding for clean_subcategories
vectorizer = CountVectorizer(binary=True)#one hot encoding
vectorizer.fit(X_train['clean_subcategories'].values) # fit has to happen only on train data
get_features.extend(vectorizer.get_feature_names())
# we use the fitted CountVectorizer to convert the text to vector
X_train_clean_subcategories_ohe = vectorizer.transform(X_train['clean_subcategories'].values).todense()
X_test_clean_subcategories_ohe = vectorizer.transform(X_test['clean_subcategories'].values).todense()
print(X_train_clean_subcategories_ohe.shape)
(40000, 30)
#Normalizating the price column
standard_vector1=Normalizer()
standard_vector1.fit(X_train.price.values.reshape(-1,1))
get_features.append('price')
X_train_price_norm=standard_vector1.transform(X_train.price.values.reshape(-1,1))
X_test_price_norm=standard_vector1.transform(X_test.price.values.reshape(-1,1))
print(X_train_price_norm.shape)
(40000, 1)
#Normalizating the teacher_number_of_previously_posted_projects column
standard_vector1=Normalizer()
standard_vector1.fit(X_train.teacher_number_of_previously_posted_projects.values.reshape(-1,1))
get_features.append('teacher_number_of_previously_posted_projects')
X_train_teacher_number_of_previously_posted_projects_norm=standard_vector1.transform(X_train.teacher_number_of_previously_posted_projects.values.reshape(-1,1))
X_test_teacher_number_of_previously_posted_projects_norm=standard_vector1.transform(X_test.teacher_number_of_previously_posted_projects.values.reshape(-1,1))
print(X_train_teacher_number_of_previously_posted_projects_norm.shape)
(40000, 1)
#Forming dataset1 with categorical, numerical features + preprocessed_essay (TFIDF) + Sentiment scores(preprocessed_essay)
dataset1_train=np.hstack((essay_sentiment_train.to_numpy(),X_train_essay_Tfidf,X_train_state_ohe,X_train_teacher_ohe,X_train_project_ohe,X_train_clean_categories_ohe,X_train_clean_subcategories_ohe,X_train_price_norm,X_train_teacher_number_of_previously_posted_projects_norm))
dataset1_test=np.hstack((essay_sentiment_test.to_numpy(),X_test_essay_Tfidf,X_test_state_ohe,X_test_teacher_ohe,X_test_project_ohe,X_test_clean_categories_ohe,X_test_clean_subcategories_ohe,X_test_price_norm,X_test_teacher_number_of_previously_posted_projects_norm))
print(dataset1_train.shape)
(40000, 5105)
#forming dataset2 with categorical, numerical features + preprocessed_essay (TFIDF W2V) + Sentiment scores(preprocessed_essay)
dataset2_train=np.hstack((essay_sentiment_train.to_numpy(),np.array(tfidf_w2v_vectors),X_train_state_ohe,X_train_teacher_ohe,X_train_project_ohe,X_train_clean_categories_ohe,X_train_clean_subcategories_ohe,X_train_price_norm,X_train_teacher_number_of_previously_posted_projects_norm))
dataset2_test=np.hstack((essay_sentiment_test.to_numpy(),np.array(tfidf_w2v_vectors_test),X_test_state_ohe,X_test_teacher_ohe,X_test_project_ohe,X_test_clean_categories_ohe,X_test_clean_subcategories_ohe,X_test_price_norm,X_test_teacher_number_of_previously_posted_projects_norm))
def find_AUC(min_sample,depth,x,y,X_test,Y_test):
AUC_score=[]
count=0
for i in min_sample:
for j in depth:
clf_model1=DecisionTreeClassifier(criterion='gini',min_samples_split=i,max_depth=j,splitter='random',class_weight='balanced')
clf_model1.fit(x,y)
y_pred=clf_model1.predict(X_test)
#Y_test_true=y[y==1]
#X_test_true=x[x==1]
count=count+1
print(count,end=' ')
AUC_score.append([i,j,roc_auc_score(Y_test,clf_model1.predict_proba(X_test)[:,1])])
return AUC_score
auc_values_dataset1=find_AUC([5, 10, 100, 500],[1, 3, 10, 30],dataset1_train,Y_train,dataset1_test,Y_test)
auc_values_dataset2=find_AUC([5, 10, 100, 500],[1, 3, 10, 30],dataset2_train,Y_train,dataset2_test,Y_test)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
import numpy as np
import matplotlib.pyplot as plt
x1 = np.array(auc_values_dataset1)[:,0]
y1 = np.array(auc_values_dataset1)[:,1]
z1 = np.array(auc_values_dataset1)[:,2]
x2 = np.array(auc_values_dataset2)[:,0]
y2 = np.array(auc_values_dataset2)[:,1]
z2 = np.array(auc_values_dataset2)[:,2]
# https://plot.ly/python/3d-axes/
trace1 = go.Scatter3d(x=x1,y=y1,z=z1, name = 'train')
trace2 = go.Scatter3d(x=x2,y=y2,z=z2, name = 'Cross validation')
data = [trace1, trace2]
layout = go.Layout(scene = dict(
xaxis = dict(title='n_estimators'),
yaxis = dict(title='max_depth'),
zaxis = dict(title='AUC'),))
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, filename='3d-scatter-colorscale')
fig.show()
plt.show()
def get_max_auc(ar):
AUC=0
min_sample=0
depth=0
for i in ar:
k,l,m= i
if AUC<m:
AUC=m
min_sample=k
depth=l
return min_sample,depth,AUC
sample_1,depth_1,AUC_1=get_max_auc(auc_values_dataset1)
print(f'minimum Sample split is {sample_1}, maximum depth is {depth_1}, and maximum AUC is {AUC_1} for dataset1')
sample_2,depth_2,AUC_2=get_max_auc(auc_values_dataset2)
print(f'minimum Sample split is {sample_2}, maximum depth is {depth_2}, and maximum AUC is {AUC_2} for dataset2 ')
minimum Sample split is 500, maximum depth is 10, and maximum AUC is 0.6139423540634512 for dataset1 minimum Sample split is 500, maximum depth is 10, and maximum AUC is 0.613218229071097 for dataset2
#building the model with the best_hyper parameters
clf_model1=DecisionTreeClassifier(criterion='gini',min_samples_split=sample_1,max_depth=depth_1,splitter='random',class_weight='balanced')
clf_model1.fit(dataset1_train,Y_train)
y_pred_train_1=clf_model1.predict(dataset1_train)
y_pred_test_1=clf_model1.predict(dataset1_test)
new_proba_train=clf_model1.predict_proba(dataset1_train)
new_proba_test=clf_model1.predict_proba(dataset1_test)
from sklearn.metrics import roc_curve
fpr,tpr,th=roc_curve(Y_train,new_proba_train[:,1])
fpr_t,tpr_t,th_t=roc_curve(Y_test,new_proba_test[:,1])
def get_max(fpr,tpr,th):
temp=0
ind=0
for i,j in zip(fpr,tpr):
temp1=abs(j*(1-i))
if temp<temp1:
temp=temp1
req=th[ind]
ind=ind+1
return req
print(f'Best thershold value for train data is {get_max(fpr,tpr,th)}')
print(f'Best thershold value for train data is {get_max(fpr_t,tpr_t,th_t)}')
Best thershold value for train data is 0.47784257591656776 Best thershold value for train data is 0.47784257591656776
# Plot the ROC-AUC curves using the probability predictions made on train and test data.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
plt.plot(fpr,tpr,label='train')
plt.plot(fpr_t,tpr_t,label='test')
plt.legend()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC_curve for dataset1')
plt.show()
print(f' For train data the Roc score is {roc_auc_score(Y_train,y_pred_train_1)}')
print(f' For Test data the Roc score is {roc_auc_score(Y_test,y_pred_test_1)}')
For train data the Roc score is 0.6195143809962275 For Test data the Roc score is 0.5770007316235456
y_pred_train = (clf_model1.predict_proba(dataset1_train)[:,1] >= get_max(fpr,tpr,th)).astype(bool)
y_pred_test = (clf_model1.predict_proba(dataset1_test)[:,1] >= get_max(fpr_t,tpr_t,th_t)).astype(bool)
con=confusion_matrix(Y_train,y_pred_train)
con_2=confusion_matrix(Y_test,y_pred_test)
print(f'The COnfusion Values tn , fp , fn , tp ')
print(f'Train confusin matrix {con.ravel()}')
print(f'Test confusin matrix {con_2.ravel()}')
The COnfusion Values tn , fp , fn , tp Train confusin matrix [ 4402 1981 15377 18240] Test confusin matrix [1018 606 3896 4480]
#Building model 2
#building the model with the best_hyper parameters min_samples_split=10,max_depth=1
clf_model2=DecisionTreeClassifier(criterion='gini',min_samples_split=sample_2,max_depth=depth_2,splitter='random',class_weight='balanced')
clf_model2.fit(dataset2_train,Y_train)
y_pred_train=clf_model2.predict(dataset2_train)
y_pred_test=clf_model2.predict(dataset2_test)
new_proba_train=clf_model2.predict_proba(dataset2_train)
new_proba_test=clf_model2.predict_proba(dataset2_test)
from sklearn.metrics import roc_curve
fpr,tpr,th=roc_curve(Y_train,new_proba_train[:,1])
fpr_t,tpr_t,th_t=roc_curve(Y_test,new_proba_test[:,1])
def get_max(fpr,tpr,th):
temp=0
ind=0
for i,j in zip(fpr,tpr):
temp1=abs(j*(1-i))
if temp<temp1:
temp=temp1
req=th[ind]
ind=ind+1
return req
print(f'Best thershold value for train data is {get_max(fpr,tpr,th)}')
print(f'Best thershold value for train data is {get_max(fpr_t,tpr_t,th_t)}')
Best thershold value for train data is 0.48232345298430185 Best thershold value for train data is 0.4756647060250733
plt.plot(fpr,tpr,label='train')
plt.plot(fpr_t,tpr_t,label='test')
plt.legend()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC_curve For dataset2 ')
plt.show()
print(f' For train data the Roc score is {roc_auc_score(Y_train,y_pred_train)}')
print(f' For Test data the Roc score is {roc_auc_score(Y_test,y_pred_test)}')
For train data the Roc score is 0.6280152261764526 For Test data the Roc score is 0.5796863899200625
y_pred_train = (clf_model2.predict_proba(dataset2_train)[:,1] >= get_max(fpr,tpr,th)).astype(bool)
y_pred_test = (clf_model2.predict_proba(dataset2_test)[:,1] >= get_max(fpr_t,tpr_t,th_t)).astype(bool)
con=confusion_matrix(Y_train,y_pred_train)
con_2=confusion_matrix(Y_test,y_pred_test)
print(f'The COnfusion Values tn , fp , fn , tp ')
print(f'Train confusin matrix {con.ravel()}')
print(f'Test confusin matrix {con_2.ravel()}')
The COnfusion Values tn , fp , fn , tp Train confusin matrix [ 4120 2263 13155 20462] Test confusin matrix [ 883 741 3168 5208]
#Getting all the False Possitive points from X_test from model_1
false_possitive_data_points=X_test[(Y_test==0) & (y_pred_test_1!=Y_test)]
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
comment_words = ''
stopwords = set(STOPWORDS)
for val in false_possitive_data_points.essay:
# typecaste each val to string
val = str(val)
# split the value
tokens = val.split()
# Converts each token into lowercase
for i in range(len(tokens)):
tokens[i] = tokens[i].lower()
comment_words += " ".join(tokens)+" "
wordcloud = WordCloud(width = 800, height = 800,
background_color ='white',
stopwords = stopwords,
min_font_size = 10).generate(comment_words)
# plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()
print('The related plot is done from the source https://www.geeksforgeeks.org/generating-word-cloud-python/')
The related plot is done from the source https://www.geeksforgeeks.org/generating-word-cloud-python/
#Boxplot for price
from seaborn import boxplot
boxplot(x='price',data=false_possitive_data_points)
plt.title('Box plot for Price')
plt.show()
from seaborn import distplot
distplot(a=false_possitive_data_points['teacher_number_of_previously_posted_projects'],bins=10)
plt.title('PDF for `teacher_number_of_previously_posted_projects')
plt.show()
D:\fardata\lib\site-packages\seaborn\distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
# 1. write your code in following steps for task 2
# 2. select all non zero features
# 3. Update your dataset i.e. X_train,X_test and X_cv so that it contains all rows and only non zero features
# 4. perform hyperparameter tuning and plot either heatmap or 3d plot.
# 5. Fit the best model. Plot ROC AUC curve and confusion matrix similar to model 1.
#Building a new model to get features
clf=DecisionTreeClassifier()
clf.fit(dataset1_train,Y_train)
DecisionTreeClassifier()
#get all the features
t=clf.feature_importances_
t.shape
(5105,)
#picking Non zero column index
get_features=np.array(get_features)
new_t=np.where(t!=0)
get_features_names=get_features[new_t]
#Picking only required columns from the dataset1
dataset1_traint=pd.DataFrame(dataset1_train,columns=get_features)
dataset1_testt=pd.DataFrame(dataset1_test,columns=get_features)
new_dataset1_train=dataset1_traint.loc[:,get_features_names]
new_dataset1_test=dataset1_testt.loc[:,get_features_names]
#building a new model with new_dataset
#To build this model i am again going to use the Decision Trees
#Going to use RandomSearchCV to get best HyperParameters [5, 10, 100, 500],[1, 3, 10, 30]
clf=DecisionTreeClassifier()
parameters={'criterion':['gini','entropy'],
'splitter':['best','random'],
'min_samples_split':[5, 10, 100, 500],
'max_depth':[1, 3, 10, 30],
'max_features':['auto','sqrt','log2'],
'class_weight':['balanced']}
from sklearn.model_selection import RandomizedSearchCV
def random(est,para,x,y,n_iter):
rd=RandomizedSearchCV(est,para,n_iter=n_iter,n_jobs=-1,cv=10,random_state=10)
rd.fit(x,y)
best_score=rd.best_score_
best_params=rd.best_params_
print(best_score)
return best_params
d=random(clf,parameters,new_dataset1_train,Y_train,10)#Getting best hyperparameters and storing them in the d dictionary
0.635475
#Now biluding the model with the above parameters
clf=DecisionTreeClassifier(splitter=d['splitter'],min_samples_split=d['min_samples_split'],max_features=d['max_features'],criterion=d['criterion'],class_weight=d['class_weight'],max_depth=d['max_depth'])
clf.fit(new_dataset1_train,Y_train)
y_pred_train=clf.predict(new_dataset1_train)
y_pred_test=clf.predict(new_dataset1_test)
new_proba_train=clf.predict_proba(new_dataset1_train)
new_proba_test=clf.predict_proba(new_dataset1_test)
from sklearn.metrics import roc_curve
fpr,tpr,th=roc_curve(Y_train,new_proba_train[:,1])
fpr_t,tpr_t,th_t=roc_curve(Y_test,new_proba_test[:,1])
def get_max(fpr,tpr,th):
temp=0
ind=0
for i,j in zip(fpr,tpr):
temp1=abs(j*(1-i))
if temp<temp1:
temp=temp1
req=th[ind]
ind=ind+1
return req
print(f'Best thershold value for train data is {get_max(fpr,tpr,th)}')
print(f'Best thershold value for train data is {get_max(fpr_t,tpr_t,th_t)}')
Best thershold value for train data is 0.5017989927379716 Best thershold value for train data is 0.5017989927379716
plt.plot(fpr,tpr,label='train')
plt.plot(fpr_t,tpr_t,label='test')
plt.legend()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC_curve For new_dataset1 ')
plt.show()
print(f' For train data the Roc score is {roc_auc_score(Y_train,y_pred_train)}')
print(f' For Test data the Roc score is {roc_auc_score(Y_test,y_pred_test)}')
For train data the Roc score is 0.503540828694605 For Test data the Roc score is 0.4992519090434316
y_pred_train = (clf.predict_proba(new_dataset1_train)[:,1] >= get_max(fpr,tpr,th)).astype(bool)
y_pred_test = (clf.predict_proba(new_dataset1_test)[:,1] >= get_max(fpr_t,tpr_t,th_t)).astype(bool)
con=confusion_matrix(Y_train,y_pred_train)
con_2=confusion_matrix(Y_test,y_pred_test)
print(f'The COnfusion Values tn , fp , fn , tp ')
print(f'Train confusin matrix {con.ravel()}')
print(f'Test confusin matrix {con_2.ravel()}')
The COnfusion Values tn , fp , fn , tp Train confusin matrix [ 124 6259 415 33202] Test confusin matrix [ 22 1602 126 8250]
print(f'The Number of correctly classified points in the test dataset are {(Y_test==y_test_pred).sum()}')
The Number of correctly classified points in the test dataset are 7276
# Tabulate your results
req_details=['Vectorizer','Model','Hyperparameter-1','Hyperparameter-2','AUC']
from prettytable import PrettyTable
x = PrettyTable()
x.field_names=req_details
x.add_row(['TFIDF','Brute',depth_1,sample_1,AUC_1])
x.add_row(['TFIDF w2v','Brute',depth_2,sample_2,AUC_2])
print(x)
+------------+-------+------------------+------------------+--------------------+ | Vectorizer | Model | Hyperparameter-1 | Hyperparameter-2 | AUC | +------------+-------+------------------+------------------+--------------------+ | TFIDF | Brute | 10 | 500 | 0.6139423540634512 | | TFIDF w2v | Brute | 10 | 500 | 0.613218229071097 | +------------+-------+------------------+------------------+--------------------+
#Summary
# From the above table i can say that the both TFIDF and TFIDF w2v have worked in the same manner.
# We can improve the performance of the model by using the model by using feature engineering
# There might be some other models which will fill fit well for this type of problem